/*
Helper file to get datasets ready for 2022 LPDID loop. 
- create pikscnt datasets for random sampling by category. 
- create analysis datasets by category to shorten the merge. 

[] brackets denote internal census variable names that can't be disclosed 
*/

libname hannah '/projects/users/########/Snapshot2022/IntermediateData';
libname hantmp '/projects/users/########/Snapshot2022/IntermediateData/TempData';
libname ans '/projects/users/########/Snapshot2022/AnalysisData';


/*
create the pikscnt datasets by category. 
*/

/* white */
/*
proc sql;
  create table hantmp.white_pikscnt as select
  a.*, b.*
  from hantmp.rand_pikscnt as a 
    inner join hantmp.pikspine as b on a.pik=b.pik
  order by a.pik;
quit;

data hantmp.white_pikscnt;
  set hantmp.white_pikscnt (keep = pik [county id] white);
  if white=0 then delete;
  drop white;
run;

* drop duplicates - get unique piks and their county IDs;
proc sort data = hantmp.white_pikscnt nodupkey;
  by pik;
run;

* data must be sorted by strata;
proc sort data = hantmp.white_pikscnt;
  by [county id];
run;

* get white subset of analysis data;
proc sql;
  create table ans.white_analysis as select
  a.pik, a.year, a.[county id], a.[annual earnings], a.unemp_instate, a.min_unemp_instate,
  a.unemp_all, a.age, a.plantcap100, a.plantcap80, a.plantcap60, a.plantcap40,
  a.plantcap20, b.*
  from ans.ehf_icf_analysis as a 
    inner join hantmp.white_pikscnt as b on a.pik=b.pik
  order by a.pik, a.year;
quit;
*/

/* black */
proc sql;
  create table hantmp.black_pikscnt as select
  a.*, b.*
  from hantmp.rand_pikscnt as a 
    inner join hantmp.pikspine as b on a.pik=b.pik
  order by a.pik;
quit;

data hantmp.black_pikscnt;
  set hantmp.black_pikscnt (keep = pik [county id] black);
  if black=0 then delete;
  drop black;
run;

* drop duplicates - get unique piks and their county IDs;
proc sort data = hantmp.black_pikscnt nodupkey;
  by pik;
run;

* data must be sorted by strata;
proc sort data = hantmp.black_pikscnt;
  by [county id];
run;

* get black subset of analysis data;
proc sql;
  create table ans.black_analysis as select
  a.pik, a.year, a.[county id], a.[annual earnings], a.unemp_instate, a.min_unemp_instate,
  a.unemp_all, a.age, a.plantcap100, a.plantcap80, a.plantcap60, a.plantcap40,
  a.plantcap20, b.*
  from ans.ehf_icf_analysis as a 
    inner join hantmp.black_pikscnt as b on a.pik=b.pik
  order by a.pik, a.year;
quit;

/* indnat */
proc sql;
  create table hantmp.indnat_pikscnt as select
  a.*, b.*
  from hantmp.rand_pikscnt as a 
    inner join hantmp.pikspine as b on a.pik=b.pik
  order by a.pik;
quit;

data hantmp.indnat_pikscnt;
  set hantmp.indnat_pikscnt (keep = pik [county id] ind_nat);
  if ind_nat=0 then delete;
  drop ind_nat;
run;

* drop duplicates - get unique piks and their county IDs;
proc sort data = hantmp.indnat_pikscnt nodupkey;
  by pik;
run;

* data must be sorted by strata;
proc sort data = hantmp.indnat_pikscnt;
  by [county id];
run;

* get indnat subset of analysis data;
proc sql;
  create table ans.indnat_analysis as select
  a.pik, a.year, a.[county id], a.[annual earnings], a.unemp_instate, a.min_unemp_instate,
  a.unemp_all, a.age, a.plantcap100, a.plantcap80, a.plantcap60, a.plantcap40,
  a.plantcap20, b.*
  from ans.ehf_icf_analysis as a 
    inner join hantmp.indnat_pikscnt as b on a.pik=b.pik
  order by a.pik, a.year;
quit;

/* hisp */
/*
proc sql;
  create table hantmp.hisp_pikscnt as select
  a.*, b.*
  from hantmp.rand_pikscnt as a 
    inner join hantmp.pikspine as b on a.pik=b.pik
  order by a.pik;
quit;

data hantmp.hisp_pikscnt;
  set hantmp.hisp_pikscnt (keep = pik [county id] hispanic);
  if hispanic=0 then delete;
  drop hispanic;
run;

* drop duplicates - get unique piks and their county IDs;
proc sort data = hantmp.hisp_pikscnt nodupkey;
  by pik;
run;

* data must be sorted by strata;
proc sort data = hantmp.hisp_pikscnt;
  by [county id];
run;

* get hisp subset of analysis data;
proc sql;
  create table ans.hisp_analysis as select
  a.pik, a.year, a.[county id], a.[annual earnings], a.unemp_instate, a.min_unemp_instate,
  a.unemp_all, a.age, a.plantcap100, a.plantcap80, a.plantcap60, a.plantcap40,
  a.plantcap20, b.*
  from ans.ehf_icf_analysis as a 
    inner join hantmp.hisp_pikscnt as b on a.pik=b.pik
  order by a.pik, a.year;
quit;
*/

/* male */
proc sql;
  create table hantmp.male_pikscnt as select
  a.*, b.*
  from hantmp.rand_pikscnt as a 
    inner join hantmp.pikspine as b on a.pik=b.pik
  order by a.pik;
quit;

data hantmp.male_pikscnt;
  set hantmp.male_pikscnt (keep = pik [county id] female);
  if female=1 then delete;
  drop female;
run;

* drop duplicates - get unique piks and their county IDs;
proc sort data = hantmp.male_pikscnt nodupkey;
  by pik;
run;

* data must be sorted by strata;
proc sort data = hantmp.male_pikscnt;
  by [county id];
run;

/*
* get male subset of analysis data;
proc sql;
  create table ans.male_analysis as select
  a.pik, a.year, a.[county id], a.[annual earnings], a.unemp_instate, a.min_unemp_instate,
  a.unemp_all, a.age, a.plantcap100, a.plantcap80, a.plantcap60, a.plantcap40,
  a.plantcap20, b.*
  from ans.ehf_icf_analysis as a 
    inner join hantmp.male_pikscnt as b on a.pik=b.pik
  order by a.pik, a.year;
quit;
*/

/* female */
proc sql;
  create table hantmp.female_pikscnt as select
  a.*, b.*
  from hantmp.rand_pikscnt as a 
    inner join hantmp.pikspine as b on a.pik=b.pik
  order by a.pik;
quit;

data hantmp.female_pikscnt;
  set hantmp.female_pikscnt (keep = pik [county id] female);
  if female=0 then delete;
  drop female;
run;

* drop duplicates - get unique piks and their county IDs;
proc sort data = hantmp.female_pikscnt nodupkey;
  by pik;
run;

* data must be sorted by strata;
proc sort data = hantmp.female_pikscnt;
  by [county id];
run;

/*
* get female subset of analysis data;
proc sql;
  create table ans.female_analysis as select
  a.pik, a.year, a.[county id], a.[annual earnings], a.unemp_instate, a.min_unemp_instate,
  a.unemp_all, a.age, a.plantcap100, a.plantcap80, a.plantcap60, a.plantcap40,
  a.plantcap20, b.*
  from ans.ehf_icf_analysis as a 
    inner join hantmp.female_pikscnt as b on a.pik=b.pik
  order by a.pik, a.year;
quit;
*/

/* nohigh */
proc sql;
  create table hantmp.nohigh_pikscnt as select
  a.*, b.*
  from hantmp.rand_pikscnt as a 
    inner join hantmp.pikspine as b on a.pik=b.pik
  order by a.pik;
quit;

data hantmp.nohigh_pikscnt;
  set hantmp.nohigh_pikscnt (keep = pik [county id] nohigh);
  if nohigh=0 then delete;
  drop nohigh;
run;

* drop duplicates - get unique piks and their county IDs;
proc sort data = hantmp.nohigh_pikscnt nodupkey;
  by pik;
run;

* data must be sorted by strata;
proc sort data = hantmp.nohigh_pikscnt;
  by [county id];
run;

* get nohigh subset of analysis data;
proc sql;
  create table ans.nohigh_analysis as select
  a.pik, a.year, a.[county id], a.[annual earnings], a.unemp_instate, a.min_unemp_instate,
  a.unemp_all, a.age, a.plantcap100, a.plantcap80, a.plantcap60, a.plantcap40,
  a.plantcap20, b.*
  from ans.ehf_icf_analysis as a 
    inner join hantmp.nohigh_pikscnt as b on a.pik=b.pik
  order by a.pik, a.year;
quit;

/* highsch */
proc sql;
  create table hantmp.highsch_pikscnt as select
  a.*, b.*
  from hantmp.rand_pikscnt as a 
    inner join hantmp.pikspine as b on a.pik=b.pik
  order by a.pik;
quit;

data hantmp.highsch_pikscnt;
  set hantmp.highsch_pikscnt (keep = pik [county id] highsch);
  if highsch=0 then delete;
  drop highsch;
run;

* drop duplicates - get unique piks and their county IDs;
proc sort data = hantmp.highsch_pikscnt nodupkey;
  by pik;
run;

* data must be sorted by strata;
proc sort data = hantmp.highsch_pikscnt;
  by [county id];
run;

* get highsch subset of analysis data;
proc sql;
  create table ans.highsch_analysis as select
  a.pik, a.year, a.[county id], a.[annual earnings], a.unemp_instate, a.min_unemp_instate,
  a.unemp_all, a.age, a.plantcap100, a.plantcap80, a.plantcap60, a.plantcap40,
  a.plantcap20, b.*
  from ans.ehf_icf_analysis as a 
    inner join hantmp.highsch_pikscnt as b on a.pik=b.pik
  order by a.pik, a.year;
quit;

/* somecoll */
proc sql;
  create table hantmp.somecoll_pikscnt as select
  a.*, b.*
  from hantmp.rand_pikscnt as a 
    inner join hantmp.pikspine as b on a.pik=b.pik
  order by a.pik;
quit;

data hantmp.somecoll_pikscnt;
  set hantmp.somecoll_pikscnt (keep = pik [county id] somecoll);
  if somecoll=0 then delete;
  drop somecoll;
run;

* drop duplicates - get unique piks and their county IDs;
proc sort data = hantmp.somecoll_pikscnt nodupkey;
  by pik;
run;

* data must be sorted by strata;
proc sort data = hantmp.somecoll_pikscnt;
  by [county id];
run;

* get somecoll subset of analysis data;
proc sql;
  create table ans.somecoll_analysis as select
  a.pik, a.year, a.[county id], a.[annual earnings], a.unemp_instate, a.min_unemp_instate,
  a.unemp_all, a.age, a.plantcap100, a.plantcap80, a.plantcap60, a.plantcap40,
  a.plantcap20, b.*
  from ans.ehf_icf_analysis as a 
    inner join hantmp.somecoll_pikscnt as b on a.pik=b.pik
  order by a.pik, a.year;
quit;

/* college */
proc sql;
  create table hantmp.college_pikscnt as select
  a.*, b.*
  from hantmp.rand_pikscnt as a 
    inner join hantmp.pikspine as b on a.pik=b.pik
  order by a.pik;
quit;

data hantmp.college_pikscnt;
  set hantmp.college_pikscnt (keep = pik [county id] college);
  if college=0 then delete;
  drop college;
run;

* drop duplicates - get unique piks and their county IDs;
proc sort data = hantmp.college_pikscnt nodupkey;
  by pik;
run;

* data must be sorted by strata;
proc sort data = hantmp.college_pikscnt;
  by [county id];
run;

* get college subset of analysis data;
proc sql;
  create table ans.college_analysis as select
  a.pik, a.year, a.[county id], a.[annual earnings], a.unemp_instate, a.min_unemp_instate,
  a.unemp_all, a.age, a.plantcap100, a.plantcap80, a.plantcap60, a.plantcap40,
  a.plantcap20, b.*
  from ans.ehf_icf_analysis as a 
    inner join hantmp.college_pikscnt as b on a.pik=b.pik
  order by a.pik, a.year;
quit;

